#!/usr/bin/python3

"""
This script reads JSON files for different features (permissions, system calls, etc.)
with data on a set of apps, and reads the features' weights from a trained model, and
makes a dataset with the most heavily weighted features of each type.

The output data format is as follows:
{"features": ["ANDROID.PERMISSION.READ_PHONE_STATE", "java/security/Signature",...],
 "apps": {"999eca2457729e371355aea5faa38e14.apk": {"vector": [0,0,0,1], "malicious": [0,1]}, ...}}
"""

from configparser import ConfigParser
import json

__author__='mwleeds'

def _load_json(path):
    """Return the parsed contents of the JSON file at *path*."""
    with open(path) as handle:
        return json.load(handle)


def _first_positions(names):
    """Map each name in *names* to the index of its first occurrence.

    Mirrors ``list.index`` semantics (first match wins) while giving
    constant-time lookups inside the per-app loops.
    """
    positions = {}
    for position, name in enumerate(names):
        positions.setdefault(name, position)
    return positions


def _select_extreme_features(feature_names, feature_weights, top_n):
    """Return the names of the highest- and lowest-weighted features.

    *feature_weights* is a sequence of per-feature weight lists; only the
    first entry of each (the malicious weight) is used for ranking.  Up to
    ``top_n // 2`` names are taken from the top of the ranking, followed by
    the same number from the bottom (lowest weight first).  Never more than
    half of the available weights are taken from either end, so the two
    selections cannot overlap.
    """
    malicious_weights = [weight[0] for weight in feature_weights]
    ranked = sorted(range(len(malicious_weights)),
                    key=malicious_weights.__getitem__, reverse=True)
    half = min(len(ranked) // 2, top_n // 2)
    if half <= 0:
        return []
    strongest = ranked[:half]
    # -(offset + 1) so that offset 0 addresses the LAST ranked entry; a plain
    # -offset would resolve to index 0 and re-select the top feature.
    weakest = [ranked[-(offset + 1)] for offset in range(half)]
    return [feature_names[index] for index in strongest + weakest]


def main():
    """Build ``app_feature_vectors.json`` from the per-feature input files.

    Reads ``config.ini`` (section ``AMA``) for:
      * ``FEATURES`` - comma-separated feature type names,
      * ``TOP_N_FEATURES`` - how many features to keep per type (half from
        the top of the malicious-weight ranking, half from the bottom),
      * ``INCLUDE_DATES`` - whether to add the date buckets as features and
        restrict the output to apps present in ``app_date_vectors.json``.

    For every feature type ``<name>`` the files ``<name>_weights.json`` and
    ``app_<name>_vectors.json`` are read from the current directory.  The
    result, ``{"features": [...], "apps": {app: {"vector", "malicious"}}}``,
    is written to ``app_feature_vectors.json``.

    Raises whatever ``configparser``, ``open`` or ``json`` raise when the
    configuration or an input file is missing or malformed.
    """
    config = ConfigParser()
    config.read('config.ini')
    # Tolerate "a, b" style lists and stray commas in the config value.
    features = [name.strip()
                for name in config.get('AMA', 'FEATURES').split(',')
                if name.strip()]
    top_n_features = config.getint('AMA', 'TOP_N_FEATURES')
    include_dates = config.getboolean('AMA', 'INCLUDE_DATES')

    all_features = []  # names of every feature used in the combined dataset
    app_feature_map = {}  # app name -> set of selected features it exhibits
    app_malicious_map = {}  # app name -> 'malicious' label copied from input

    # Pass 1: choose which features of each type make it into the dataset.
    for feature in features:
        feature_weights = _load_json(feature + '_weights.json')
        print('Found ' + str(len(feature_weights)) + ' sets of weights for ' + feature)
        # no need to look at benign weights; they're complementary
        feature_names = _load_json('app_' + feature + '_vectors.json')['features']
        print('Selecting ' + str(top_n_features) + ' top features of ' + str(len(feature_names)))
        all_features.extend(
            _select_extreme_features(feature_names, feature_weights, top_n_features))

    # The date feature has equal numbers of apps in each range to avoid it
    # being used as a feature directly, so only use those apps
    date_apps = None
    if include_dates:
        date_data = _load_json('app_date_vectors.json')
        date_buckets = date_data['features']
        all_features += date_buckets
        date_apps = date_data['apps']
        bucket_positions = _first_positions(date_buckets)
        for app, record in date_apps.items():
            if app not in app_malicious_map:
                app_malicious_map[app] = record['malicious']
            present = app_feature_map.setdefault(app, set())
            for bucket in date_buckets:
                if record['vector'][bucket_positions[bucket]] == 1:
                    present.add(bucket)

    # Pass 2: record, per app, which of the selected features are set.
    for feature in features:
        feature_data = _load_json('app_' + feature + '_vectors.json')
        positions = _first_positions(feature_data['features'])
        feature_apps = feature_data['apps']
        print('Found ' + str(len(feature_apps)) + ' apps for ' + feature)
        # Resolve the vector column of each selected feature once per file
        # instead of once per app.
        selected_columns = [(name, positions[name])
                            for name in all_features if name in positions]
        for app, record in feature_apps.items():
            if include_dates and app not in date_apps:
                continue
            if app not in app_malicious_map:
                app_malicious_map[app] = record['malicious']
            present = app_feature_map.setdefault(app, set())
            for name, column in selected_columns:
                if record['vector'][column] == 1:
                    present.add(name)

    # Combine both maps into the output structure, encoding features as bits.
    all_apps = {
        app_name: {
            'vector': [1 if name in present else 0 for name in all_features],
            'malicious': app_malicious_map[app_name],
        }
        for app_name, present in app_feature_map.items()
    }
    with open('app_feature_vectors.json', 'w') as outfile:
        json.dump({'features': all_features, 'apps': all_apps}, outfile)
    print('Wrote data on ' + str(len(all_features)) + ' features and ' + str(len(all_apps)) + ' apps to a file.')

# Run the dataset builder only when executed as a script, not on import.
if __name__ == '__main__':
    main()
